package com.kdtech.analyse.blog;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.StringUtils;

import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Parser for blog posts hosted on blog.edu.cn.
 *
 * <p>Recognises blog detail-page URLs and extracts the title, author, date and
 * article body from the page HTML into a {@link NewsMeta}.
 *
 * @author Chase  E-mail:594126573@qq.com
 *
 * @version created 2012-12-3 10:55:37 AM
 */
public class EduBlogAnalyse implements AnalyseNews{

	/** Scheme prefix stripped from the URL when deriving the blog (sub-domain) name. */
	private static final String HTTP_PREFIX = "http://";

	/**
	 * Shape of a blog detail page, e.g.
	 * http://ying780407.blog.edu.cn/home.php?mod=space&uid=1572569&do=blog&id=735710
	 *
	 * <p>Compiled once instead of on every call. Dots in the host and in
	 * "home.php" are escaped so they match a literal '.' only. The original
	 * second pattern ("[a-z]*[0-9]*" sub-domain) was a strict subset of this
	 * one and has been folded into it.
	 */
	private static final Pattern DETAIL_PAGE = Pattern.compile(
			"http://.*[0-9]*\\.blog\\.edu\\.cn/home\\.php[?]mod=space&uid=[0-9]*&do=blog&id=[0-9]*");

	/**
	 * Tells whether the URL points at a single blog post.
	 *
	 * @param url URL to test; may be null
	 * @return true when the whole URL matches the detail-page pattern,
	 *         false otherwise (including for null)
	 */
	public boolean isDetailPage(String url) {
		return url != null && DETAIL_PAGE.matcher(url).matches();
	}

	/**
	 * Parses a crawled blog detail page.
	 *
	 * <p>After extracting the post, one refresh via {@link #Update(NewsMeta)} is
	 * attempted. If the refresh yields nothing, the post parsed from the given
	 * page is returned rather than being discarded.
	 *
	 * @param urlMeta crawled page (URL plus HTML); may be null
	 * @return the parsed post, or null when the input is null, has no HTML,
	 *         or its URL is not a detail page
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		NewsMeta blog = extractBlog(urlMeta);
		if (blog == null) {
			return null;
		}
		// Update() returns null by itself when no update URL was set, so it is
		// safe to call unconditionally; fall back to what we already parsed.
		NewsMeta refreshed = Update(blog);
		return refreshed != null ? refreshed : blog;
	}

	/**
	 * Crawls the update URL stored on the given post and parses the response.
	 *
	 * <p>The response is parsed without triggering a further update, so each
	 * call performs at most one crawl.
	 *
	 * @param meta post carrying an update URL; may be null
	 * @return the post parsed from the update URL, or null when there is no
	 *         update URL, the crawl returned nothing, or the response is not
	 *         a parsable detail page
	 */
	public NewsMeta Update(NewsMeta meta) {
		if (meta == null || !StringUtils.isNotBlank(meta.getUpdateUrl())) {
			return null;
		}
		UrlMeta urlMeta = CrawlHTML.responseToURL(meta.getUpdateUrl());
		return extractBlog(urlMeta);
	}

	/**
	 * Extracts the post fields from a crawled page without any follow-up crawl.
	 *
	 * @return the populated post, or null when the page cannot be parsed
	 */
	private NewsMeta extractBlog(UrlMeta urlMeta) {
		if (urlMeta == null) {
			return null;
		}
		String url = urlMeta.getUrl();
		String html = urlMeta.getHtml();
		if (!isDetailPage(url) || html == null) {
			return null;
		}

		Document doc = Jsoup.parse(html);
		String title = doc.select("h1").text();
		Long date = DateUtils.matchDate(doc.select("span.xg1").text());
		String content = HtmlCleaner.getContentHtml(url, doc.select("div#blog_article"));
		String author = doc.select("h2").text();

		NewsMeta blog = new NewsMeta();
		blog.setUrl(url);
		blog.setTitle(title);
		blog.setContent(content);
		blog.setDate(date);
		blog.setAuthor(author);

		String updateUrl = buildUpdateUrl(url);
		if (updateUrl != null) {
			blog.setUpdateUrl(updateUrl);
		}
		return blog;
	}

	/**
	 * Derives the counter/update URL from a post URL:
	 * "http://" + blogName + ".blog.edu.cn/count/" + id + "count.html",
	 * where blogName is the text before the first '.' of the URL (scheme
	 * removed) and id is the text before the first '.' of the last path segment.
	 *
	 * <p>NOTE(review): for "home.php?..." URLs the id resolves to "home";
	 * this looks inherited from a site whose posts end in "&lt;id&gt;.html" —
	 * confirm the intended counter URL for blog.edu.cn.
	 *
	 * @return the update URL, or null when the URL has no '/' or its last
	 *         path segment has no '.'
	 */
	private static String buildUpdateUrl(String url) {
		if (url.indexOf('/') == -1) {
			return null;
		}
		String[] segments = url.split("/");
		if (segments.length == 0) {
			return null;
		}
		String lastSegment = segments[segments.length - 1];
		int segmentDot = lastSegment.indexOf('.');
		if (segmentDot == -1) {
			return null;
		}
		String updateUrlId = lastSegment.substring(0, segmentDot);

		// The last segment contains a '.', therefore so does the whole URL.
		String blogName = url.substring(0, url.indexOf('.'));
		if (blogName.startsWith(HTTP_PREFIX)) {
			blogName = blogName.substring(HTTP_PREFIX.length());
		}
		return "http://"+blogName+".blog.edu.cn/count/"+updateUrlId+"count.html";
	}

	/**
	 * Manual smoke test: crawls one sample post and prints the parsed result.
	 */
	public static void main(String[] args) {
		String url = "http://tajy2014.blog.edu.cn/home.php?mod=space&uid=5699073&do=blog&id=703677";
		EduBlogAnalyse test = new EduBlogAnalyse();
		if(test.isDetailPage(url)){
			UrlMeta responseToURL = CrawlHTML.responseToURL(url);
			NewsMeta parserHtml = test.parserHtml(responseToURL);
			System.out.println(parserHtml);
		}else{
			System.out.println("不符合规则");
		}
	}

}
